{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2019-04-15 19:19:34--  https://s3.amazonaws.com/text-datasets/imdb_full.pkl\n",
      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.112.189\n",
      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.112.189|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 65552540 (63M) [application/octet-stream]\n",
      "Saving to: ‘imdb_full.pkl’\n",
      "\n",
      "imdb_full.pkl       100%[===================>]  62.52M  81.2MB/s    in 0.8s    \n",
      "\n",
      "2019-04-15 19:19:35 (81.2 MB/s) - ‘imdb_full.pkl’ saved [65552540/65552540]\n",
      "\n",
      "--2019-04-15 19:19:35--  https://s3.amazonaws.com/text-datasets/imdb_word_index.json\n",
      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.112.189\n",
      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.112.189|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 1641221 (1.6M) [application/octet-stream]\n",
      "Saving to: ‘imdb_word_index.json’\n",
      "\n",
      "imdb_word_index.jso 100%[===================>]   1.56M  --.-KB/s    in 0.07s   \n",
      "\n",
      "2019-04-15 19:19:35 (21.4 MB/s) - ‘imdb_word_index.json’ saved [1641221/1641221]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://s3.amazonaws.com/text-datasets/imdb_full.pkl\n",
    "!wget https://s3.amazonaws.com/text-datasets/imdb_word_index.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "import pickle\n",
    "data = pickle.load(open('imdb_full.pkl', 'rb'))\n",
    "\n",
    "import json\n",
    "vocab = json.load(open('imdb_word_index.json'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "inv = {idx:word for word, idx in vocab.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "(X_train, y_train), (Xt, yt) = data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainidx = [i for i, x in enumerate(X_train) if len(x) < 400]\n",
    "trainidx, devidx = train_test_split(trainidx, train_size=0.8, random_state=1378)\n",
    "X = [X_train[i] for i in trainidx]\n",
    "y = [y_train[i] for i in trainidx]\n",
    "\n",
    "Xd = [X_train[i] for i in devidx]\n",
    "yd = [y_train[i] for i in devidx]\n",
    "\n",
    "testidx = [i for i, x in enumerate(Xt) if len(x) < 400]\n",
    "testidx, remaining_idx =  train_test_split(testidx, train_size=0.2, random_state=1378)\n",
    "\n",
    "Xt = [Xt[i] for i in testidx]\n",
    "yt = [yt[i] for i in testidx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def invert_and_join(X) :\n",
    "    X = [[inv[x] for x in doc] for doc in X]\n",
    "    X = [\" \".join(x) for x in X]\n",
    "    return X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = invert_and_join(X)\n",
    "Xd = invert_and_join(Xd)\n",
    "Xt = invert_and_join(Xt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts = {'train' : X, 'test' : Xt, 'dev' : Xd}\n",
    "labels = {'train' : y, 'test' : yt, 'dev' : yd}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "df_texts = []\n",
    "df_labels = []\n",
    "df_exp_splits = []\n",
    "\n",
    "for key in ['train', 'test', 'dev'] :\n",
    "    df_texts += texts[key]\n",
    "    df_labels += labels[key]\n",
    "    df_exp_splits += [key] * len(texts[key])\n",
    "    \n",
    "df = pd.DataFrame({'text' : df_texts, 'label' : df_labels, 'exp_split' : df_exp_splits})\n",
    "df.to_csv('imdb_dataset.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary size :  12487\n",
      "Found 11522 words in model out of 12487\n"
     ]
    }
   ],
   "source": [
    "%run \"../preprocess_data_BC.py\" --data_file imdb_dataset.csv --output_file ./vec_imdb.p --word_vectors_type fasttext.simple.300d --min_df 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
